#importing libraries
import gc
import os
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline
from colorama import Fore, Style, init
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')
# Color printing
# inspired by https://www.kaggle.com/code/ravi20076/sleepstate-eda-baseline
def PrintColor(text:str, color = Fore.BLUE, style = Style.BRIGHT):
    """Print ``text`` preceded by the given colorama style and colour codes.

    ``Style.RESET_ALL`` is appended so the formatting does not carry over
    to whatever is printed next.
    """
    decorated = "".join((style, color, text, Style.RESET_ALL))
    print(decorated)
# inspired by https://www.kaggle.com/code/rishabh15virgo/cmi-dss-first-impression-data-understanding-eda
def summarize_dataframe(df):
    """Build a one-row-per-column overview of ``df``.

    The returned DataFrame is indexed by the column names of ``df`` and
    contains:

    * ``dtypes``      - dtype of the column
    * ``missing#``    - number of missing (NaN/None/NaT) values
    * ``missing%``    - the same as a percentage of ``len(df)``
    * ``uniques``     - number of distinct non-missing values
    * ``first_value`` / ``last_value`` - value in the first / last row
      (NaN when ``df`` has no rows)
    * ``count``       - number of non-missing values
    * ``min`` / ``max`` / ``mean`` - taken from ``df.describe()``; NaN for
      columns (or whole frames) for which describe() reports no such statistic
    """
    n_rows = len(df)
    n_missing = df.isna().sum().values

    summary_df = pd.DataFrame(df.dtypes, columns=['dtypes'])
    # Raw count of missing cells; the percentage is kept separately below.
    summary_df['missing#'] = n_missing
    summary_df['missing%'] = (n_missing * 100) / n_rows
    summary_df['uniques'] = df.nunique().values
    # An empty frame has no first/last row to report.
    summary_df['first_value'] = df.iloc[0].values if n_rows else np.nan
    summary_df['last_value'] = df.iloc[-1].values if n_rows else np.nan
    summary_df['count'] = df.count().values

    # describe() omits min/max/mean entirely when the frame has no numeric
    # column; reindex turns the missing statistics into NaN columns instead
    # of raising KeyError on lookup.
    desc = df.describe().T.reindex(columns=['min', 'max', 'mean'])
    summary_df['min'] = desc['min']
    summary_df['max'] = desc['max']
    summary_df['mean'] = desc['mean']
    return summary_df
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        PrintColor(os.path.join(folder, file_name))
This chapter conveys a summary of each data set in the 2024 Data Bowl, a list of key variables to join on, and a description of each variable. The tracking data is provided by the NFL Next Gen Stats team. The pff_missedTackle column in the tackles data is provided by Pro Football Focus. The player tracking in the dataset includes data from Weeks 1-9 of the 2022 NFL season. The data show the location, speed, and acceleration of all 22 players on the field, along with the football's location. Additional PFF scouting data and NFL advanced stats such as expected points and win probability are also included.
File descriptions
Game data: The games.csv contains the teams playing in each game. The key variable is gameId.
Play data: The plays.csv file contains play-level information from each game. The key variables are gameId and playId.
Player data: The players.csv file contains player-level information from players that participated in any of the tracking data files. The key variable is nflId.
Tackles data: The tackles.csv file contains player-level tackle information for each game and play. The key variables are gameId, playId, and nflId.
Tracking data: Files tracking_week_[week].csv contain player tracking data from week [week]. The key variables are gameId, playId, and nflId.
Games Data Inspection
# Load the games table and preview its first five rows as a styled table.
games = pd.read_csv("C:/Users/vinit/Downloads/VIRTI/games.csv")
(
    games.head(5)
    .style
    .set_caption("Sample of the games data")
    .set_properties(**{'border': '1.3px solid blue', 'color': 'grey'})
)
| gameId | season | week | gameDate | gameTimeEastern | homeTeamAbbr | visitorTeamAbbr | homeFinalScore | visitorFinalScore | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022090800 | 2022 | 1 | 09/08/2022 | 20:20:00 | LA | BUF | 10 | 31 |
| 1 | 2022091100 | 2022 | 1 | 09/11/2022 | 13:00:00 | ATL | NO | 26 | 27 |
| 2 | 2022091101 | 2022 | 1 | 09/11/2022 | 13:00:00 | CAR | CLE | 24 | 26 |
| 3 | 2022091102 | 2022 | 1 | 09/11/2022 | 13:00:00 | CHI | SF | 19 | 10 |
| 4 | 2022091103 | 2022 | 1 | 09/11/2022 | 13:00:00 | CIN | PIT | 20 | 23 |
# Print column dtypes, non-null counts and memory usage for the games table.
games.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 136 entries, 0 to 135 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gameId 136 non-null int64 1 season 136 non-null int64 2 week 136 non-null int64 3 gameDate 136 non-null object 4 gameTimeEastern 136 non-null object 5 homeTeamAbbr 136 non-null object 6 visitorTeamAbbr 136 non-null object 7 homeFinalScore 136 non-null int64 8 visitorFinalScore 136 non-null int64 dtypes: int64(5), object(4) memory usage: 9.7+ KB
# Per-column summary of the games table, shaded with a purple gradient.
summarize_dataframe(games).style.background_gradient(cmap='Purples')
| dtypes | missing# | missing% | uniques | first_value | last_value | count | min | max | mean | |
|---|---|---|---|---|---|---|---|---|---|---|
| gameId | int64 | 0 | 0.000000 | 136 | 2022090800 | 2022110700 | 136 | 2022090800.000000 | 2022110700.000000 | 2022098922.117647 |
| season | int64 | 0 | 0.000000 | 1 | 2022 | 2022 | 136 | 2022.000000 | 2022.000000 | 2022.000000 |
| week | int64 | 0 | 0.000000 | 9 | 1 | 9 | 136 | 1.000000 | 9.000000 | 4.845588 |
| gameDate | object | 0 | 0.000000 | 27 | 09/08/2022 | 11/07/2022 | 136 | nan | nan | nan |
| gameTimeEastern | object | 0 | 0.000000 | 8 | 20:20:00 | 20:15:00 | 136 | nan | nan | nan |
| homeTeamAbbr | object | 0 | 0.000000 | 32 | LA | NO | 136 | nan | nan | nan |
| visitorTeamAbbr | object | 0 | 0.000000 | 32 | BUF | BAL | 136 | nan | nan | nan |
| homeFinalScore | int64 | 0 | 0.000000 | 38 | 10 | 13 | 136 | 3.000000 | 49.000000 | 22.669118 |
| visitorFinalScore | int64 | 0 | 0.000000 | 35 | 31 | 27 | 136 | 0.000000 | 48.000000 | 20.948529 |
The charts show data from 136 games played between September 8, 2022, and November 7, 2022 (spanning 9 weeks of Season 2022). The first 8 characters of the 'gameId' attribute indicate the game date in YYYYMMDD format.
On average, teams scored 22.67 points in home games and 20.95 points in away games. The scores for home games ranged from 3 to 49, while the scores for away games ranged from 0 to 48.
Players Data Inspection
# Load the players table and preview its first five rows as a styled table.
players_path = "C:/Users/vinit/Downloads/VIRTI/players.csv"
players = pd.read_csv(players_path)
(
    players.head(5)
    .style
    .set_caption("Sample of the players data")
    .set_properties(**{'border': '1.3px solid blue', 'color': 'grey'})
)
| nflId | height | weight | birthDate | collegeName | position | displayName | |
|---|---|---|---|---|---|---|---|
| 0 | 25511 | 6-4 | 225 | 1977-08-03 | Michigan | QB | Tom Brady |
| 1 | 29550 | 6-4 | 328 | 1982-01-22 | Arkansas | T | Jason Peters |
| 2 | 29851 | 6-2 | 225 | 1983-12-02 | California | QB | Aaron Rodgers |
| 3 | 30842 | 6-6 | 267 | 1984-05-19 | UCLA | TE | Marcedes Lewis |
| 4 | 33084 | 6-4 | 217 | 1985-05-17 | Boston College | QB | Matt Ryan |
# Print column dtypes, non-null counts and memory usage for the players table.
players.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1683 entries, 0 to 1682 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 nflId 1683 non-null int64 1 height 1683 non-null object 2 weight 1683 non-null int64 3 birthDate 1204 non-null object 4 collegeName 1683 non-null object 5 position 1683 non-null object 6 displayName 1683 non-null object dtypes: int64(2), object(5) memory usage: 92.2+ KB
# Per-column summary of the players table, shaded with a purple gradient.
summarize_dataframe(players).style.background_gradient(cmap='Purples')
| dtypes | missing# | missing% | uniques | first_value | last_value | count | min | max | mean | |
|---|---|---|---|---|---|---|---|---|---|---|
| nflId | int64 | 0 | 0.000000 | 1683 | 25511 | 55241 | 1683 | 25511.000000 | 55241.000000 | 48221.702317 |
| height | object | 0 | 0.000000 | 16 | 6-4 | 6-2 | 1683 | nan | nan | nan |
| weight | int64 | 0 | 0.000000 | 179 | 225 | 280 | 1683 | 153.000000 | 380.000000 | 245.724302 |
| birthDate | object | 47900 | 28.461081 | 985 | 1977-08-03 | nan | 1204 | nan | nan | nan |
| collegeName | object | 0 | 0.000000 | 226 | Michigan | Coastal Carolina | 1683 | nan | nan | nan |
| position | object | 0 | 0.000000 | 19 | QB | DT | 1683 | nan | nan | nan |
| displayName | object | 0 | 0.000000 | 1672 | Tom Brady | C.J. Brewer | 1683 | nan | nan | nan |
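We discovered that the dataset describes 1,683 players, and that birthDate is the only column with missing values (479 players, about 28.46%).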
Plays Data Inspection
# Load the plays table and preview its first five rows as a styled table.
plays_path = "C:/Users/vinit/Downloads/VIRTI/plays.csv"
plays = pd.read_csv(plays_path)
(
    plays.head(5)
    .style
    .set_caption("Sample of the plays data")
    .set_properties(**{'border': '1.3px solid blue', 'color': 'grey'})
)
| gameId | playId | ballCarrierId | ballCarrierDisplayName | playDescription | quarter | down | yardsToGo | possessionTeam | defensiveTeam | yardlineSide | yardlineNumber | gameClock | preSnapHomeScore | preSnapVisitorScore | passResult | passLength | penaltyYards | prePenaltyPlayResult | playResult | playNullifiedByPenalty | absoluteYardlineNumber | offenseFormation | defendersInTheBox | passProbability | preSnapHomeTeamWinProbability | preSnapVisitorTeamWinProbability | homeTeamWinProbabilityAdded | visitorTeamWinProbilityAdded | expectedPoints | expectedPointsAdded | foulName1 | foulName2 | foulNFLId1 | foulNFLId2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022100908 | 3537 | 48723 | Parker Hesse | (7:52) (Shotgun) M.Mariota pass short middle to P.Hesse to 50 for 9 yards (K.Neal). | 4 | 1 | 10 | ATL | TB | ATL | 41 | 7:52 | 21 | 7 | C | 6.000000 | nan | 9 | 9 | N | 69 | SHOTGUN | 7.000000 | 0.747284 | 0.976785 | 0.023215 | -0.006110 | 0.006110 | 2.360609 | 0.981955 | nan | nan | nan | nan |
| 1 | 2022091103 | 3126 | 52457 | Chase Claypool | (7:38) (Shotgun) C.Claypool right end to PIT 37 for 3 yards (C.Awuzie). | 4 | 1 | 10 | PIT | CIN | PIT | 34 | 7:38 | 14 | 20 | nan | nan | nan | 3 | 3 | N | 76 | SHOTGUN | 7.000000 | 0.416454 | 0.160485 | 0.839515 | -0.010865 | 0.010865 | 1.733344 | -0.263424 | nan | nan | nan | nan |
| 2 | 2022091111 | 1148 | 42547 | Darren Waller | (8:57) D.Carr pass short middle to D.Waller to LV 45 for 15 yards (N.Adderley). | 2 | 2 | 5 | LV | LAC | LV | 30 | 8:57 | 10 | 3 | C | 11.000000 | nan | 15 | 15 | N | 40 | I_FORM | 6.000000 | 0.267933 | 0.756661 | 0.243339 | -0.037409 | 0.037409 | 1.312855 | 1.133666 | nan | nan | nan | nan |
| 3 | 2022100212 | 2007 | 46461 | Mike Boone | (13:12) M.Boone left tackle to DEN 44 for 7 yards (J.Abram; D.Deablo). | 3 | 2 | 10 | DEN | LV | DEN | 37 | 13:12 | 19 | 16 | nan | nan | nan | 7 | 7 | N | 47 | SINGLEBACK | 6.000000 | 0.592704 | 0.620552 | 0.379448 | -0.002451 | 0.002451 | 1.641006 | -0.043580 | nan | nan | nan | nan |
| 4 | 2022091900 | 1372 | 47857 | Devin Singletary | (8:33) D.Singletary right guard to TEN 32 for 3 yards (N.Jones; O.Adeniyi). TEN-O.Adeniyi was injured during the play. His return is Questionable. O.Adeniyi walks off. | 2 | 1 | 10 | BUF | TEN | TEN | 35 | 8:33 | 7 | 7 | nan | nan | nan | 3 | 3 | N | 75 | I_FORM | 7.000000 | 0.470508 | 0.836290 | 0.163710 | 0.001053 | -0.001053 | 3.686428 | -0.167903 | nan | nan | nan | nan |
# Print column dtypes, non-null counts and memory usage for the plays table.
plays.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12486 entries, 0 to 12485 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gameId 12486 non-null int64 1 playId 12486 non-null int64 2 ballCarrierId 12486 non-null int64 3 ballCarrierDisplayName 12486 non-null object 4 playDescription 12486 non-null object 5 quarter 12486 non-null int64 6 down 12486 non-null int64 7 yardsToGo 12486 non-null int64 8 possessionTeam 12486 non-null object 9 defensiveTeam 12486 non-null object 10 yardlineSide 12319 non-null object 11 yardlineNumber 12486 non-null int64 12 gameClock 12486 non-null object 13 preSnapHomeScore 12486 non-null int64 14 preSnapVisitorScore 12486 non-null int64 15 passResult 6105 non-null object 16 passLength 5634 non-null float64 17 penaltyYards 615 non-null float64 18 prePenaltyPlayResult 12486 non-null int64 19 playResult 12486 non-null int64 20 playNullifiedByPenalty 12486 non-null object 21 absoluteYardlineNumber 12486 non-null int64 22 offenseFormation 12482 non-null object 23 defendersInTheBox 12481 non-null float64 24 passProbability 12149 non-null float64 25 preSnapHomeTeamWinProbability 12486 non-null float64 26 preSnapVisitorTeamWinProbability 12486 non-null float64 27 homeTeamWinProbabilityAdded 12486 non-null float64 28 visitorTeamWinProbilityAdded 12486 non-null float64 29 expectedPoints 12486 non-null float64 30 expectedPointsAdded 12485 non-null float64 31 foulName1 592 non-null object 32 foulName2 25 non-null object 33 foulNFLId1 592 non-null float64 34 foulNFLId2 25 non-null float64 dtypes: float64(12), int64(12), object(11) memory usage: 3.3+ MB
# Per-column summary of the plays table, shaded with a purple gradient.
summarize_dataframe(plays).style.background_gradient(cmap='Purples')
| dtypes | missing# | missing% | uniques | first_value | last_value | count | min | max | mean | |
|---|---|---|---|---|---|---|---|---|---|---|
| gameId | int64 | 0 | 0.000000 | 136 | 2022100908 | 2022102308 | 12486 | 2022090800.000000 | 2022110700.000000 | 2022098953.855598 |
| playId | int64 | 0 | 0.000000 | 3974 | 3537 | 905 | 12486 | 54.000000 | 5096.000000 | 1986.603476 |
| ballCarrierId | int64 | 0 | 0.000000 | 480 | 48723 | 54572 | 12486 | 25511.000000 | 55158.000000 | 48072.271664 |
| ballCarrierDisplayName | object | 0 | 0.000000 | 480 | Parker Hesse | Dameon Pierce | 12486 | nan | nan | nan |
| playDescription | object | 0 | 0.000000 | 12486 | (7:52) (Shotgun) M.Mariota pass short middle to P.Hesse to 50 for 9 yards (K.Neal). | (12:51) C.Heck reported in as eligible. D.Pierce up the middle to LV 14 for 3 yards (D.Perryman; D.Deablo). | 12486 | nan | nan | nan |
| quarter | int64 | 0 | 0.000000 | 5 | 4 | 2 | 12486 | 1.000000 | 5.000000 | 2.550136 |
| down | int64 | 0 | 0.000000 | 4 | 1 | 3 | 12486 | 1.000000 | 4.000000 | 1.727054 |
| yardsToGo | int64 | 0 | 0.000000 | 32 | 10 | 1 | 12486 | 1.000000 | 38.000000 | 8.469085 |
| possessionTeam | object | 0 | 0.000000 | 32 | ATL | HOU | 12486 | nan | nan | nan |
| defensiveTeam | object | 0 | 0.000000 | 32 | TB | LV | 12486 | nan | nan | nan |
| yardlineSide | object | 16700 | 1.337498 | 32 | ATL | LV | 12319 | nan | nan | nan |
| yardlineNumber | int64 | 0 | 0.000000 | 50 | 41 | 17 | 12486 | 1.000000 | 50.000000 | 29.573122 |
| gameClock | object | 0 | 0.000000 | 898 | 7:52 | 12:51 | 12486 | nan | nan | nan |
| preSnapHomeScore | int64 | 0 | 0.000000 | 42 | 21 | 3 | 12486 | 0.000000 | 49.000000 | 11.170671 |
| preSnapVisitorScore | int64 | 0 | 0.000000 | 36 | 7 | 0 | 12486 | 0.000000 | 48.000000 | 9.862967 |
| passResult | object | 638100 | 51.105238 | 2 | C | nan | 6105 | nan | nan | nan |
| passLength | float64 | 685200 | 54.877463 | 65 | 6.000000 | nan | 5634 | -9.000000 | 61.000000 | 5.455982 |
| penaltyYards | float64 | 1187100 | 95.074483 | 23 | nan | nan | 615 | -15.000000 | 15.000000 | -2.091057 |
| prePenaltyPlayResult | int64 | 0 | 0.000000 | 86 | 9 | 3 | 12486 | -26.000000 | 98.000000 | 7.582012 |
| playResult | int64 | 0 | 0.000000 | 97 | 9 | 3 | 12486 | -61.000000 | 98.000000 | 7.218565 |
| playNullifiedByPenalty | object | 0 | 0.000000 | 2 | N | N | 12486 | nan | nan | nan |
| absoluteYardlineNumber | int64 | 0 | 0.000000 | 99 | 69 | 27 | 12486 | 11.000000 | 109.000000 | 60.426077 |
| offenseFormation | object | 400 | 0.032036 | 7 | SHOTGUN | I_FORM | 12482 | nan | nan | nan |
| defendersInTheBox | float64 | 500 | 0.040045 | 11 | 7.000000 | 8.000000 | 12481 | 1.000000 | 11.000000 | 6.424085 |
| passProbability | float64 | 33700 | 2.699023 | 11554 | 0.747284 | 0.101552 | 12149 | 0.005607 | 0.996206 | 0.603845 |
| preSnapHomeTeamWinProbability | float64 | 0 | 0.000000 | 12429 | 0.976785 | 0.726639 | 12486 | 0.001049 | 0.999175 | 0.556236 |
| preSnapVisitorTeamWinProbability | float64 | 0 | 0.000000 | 12429 | 0.023215 | 0.273361 | 12486 | 0.000825 | 0.998951 | 0.443764 |
| homeTeamWinProbabilityAdded | float64 | 0 | 0.000000 | 12410 | -0.006110 | -0.015028 | 12486 | -0.521181 | 0.471527 | 0.000110 |
| visitorTeamWinProbilityAdded | float64 | 0 | 0.000000 | 12410 | 0.006110 | 0.015028 | 12486 | -0.471527 | 0.521181 | -0.000110 |
| expectedPoints | float64 | 0 | 0.000000 | 12104 | 2.360609 | 4.685974 | 12486 | -2.366872 | 6.558894 | 2.258360 |
| expectedPointsAdded | float64 | 100 | 0.008009 | 12450 | 0.981955 | 0.251580 | 12485 | -9.986150 | 8.698986 | 0.297367 |
| foulName1 | object | 1189400 | 95.258690 | 25 | nan | nan | 592 | nan | nan | nan |
| foulName2 | object | 1246100 | 99.799776 | 10 | nan | nan | 25 | nan | nan | nan |
| foulNFLId1 | float64 | 1189400 | 95.258690 | 420 | nan | nan | 592 | 33107.000000 | 55157.000000 | 47759.856419 |
| foulNFLId2 | float64 | 1246100 | 99.799776 | 25 | nan | nan | 25 | 43586.000000 | 54650.000000 | 50000.560000 |
We discovered that there are details for 12,486 plays in the dataset, and each play has its own distinct, non-empty description.
Tackles Data Inspection
# Load the tackles table and preview its first five rows as a styled table.
tackles_path = "C:/Users/vinit/Downloads/VIRTI/tackles.csv"
tackles = pd.read_csv(tackles_path)
(
    tackles.head(5)
    .style
    .set_caption("Sample of the tackles data")
    .set_properties(**{'border': '1.3px solid blue', 'color': 'grey'})
)
| gameId | playId | nflId | tackle | assist | forcedFumble | pff_missedTackle | |
|---|---|---|---|---|---|---|---|
| 0 | 2022090800 | 101 | 42816 | 1 | 0 | 0 | 0 |
| 1 | 2022090800 | 393 | 46232 | 1 | 0 | 0 | 0 |
| 2 | 2022090800 | 486 | 40166 | 1 | 0 | 0 | 0 |
| 3 | 2022090800 | 646 | 47939 | 1 | 0 | 0 | 0 |
| 4 | 2022090800 | 818 | 40107 | 1 | 0 | 0 | 0 |
# Print column dtypes, non-null counts and memory usage for the tackles table.
tackles.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 17426 entries, 0 to 17425 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gameId 17426 non-null int64 1 playId 17426 non-null int64 2 nflId 17426 non-null int64 3 tackle 17426 non-null int64 4 assist 17426 non-null int64 5 forcedFumble 17426 non-null int64 6 pff_missedTackle 17426 non-null int64 dtypes: int64(7) memory usage: 953.1 KB
# Per-column summary of the tackles table, shaded with a purple gradient.
summarize_dataframe(tackles).style.background_gradient(cmap='Purples')
| dtypes | missing# | missing% | uniques | first_value | last_value | count | min | max | mean | |
|---|---|---|---|---|---|---|---|---|---|---|
| gameId | int64 | 0 | 0.000000 | 136 | 2022090800 | 2022091901 | 17426 | 2022090800.000000 | 2022110700.000000 | 2022098971.441123 |
| playId | int64 | 0 | 0.000000 | 3943 | 101 | 3845 | 17426 | 54.000000 | 5096.000000 | 1982.974578 |
| nflId | int64 | 0 | 0.000000 | 800 | 42816 | 46199 | 17426 | 33131.000000 | 55241.000000 | 47602.719442 |
| tackle | int64 | 0 | 0.000000 | 2 | 1 | 0 | 17426 | 0.000000 | 1.000000 | 0.569207 |
| assist | int64 | 0 | 0.000000 | 2 | 0 | 0 | 17426 | 0.000000 | 1.000000 | 0.315276 |
| forcedFumble | int64 | 0 | 0.000000 | 2 | 0 | 0 | 17426 | 0.000000 | 1.000000 | 0.005681 |
| pff_missedTackle | int64 | 0 | 0.000000 | 2 | 0 | 1 | 17426 | 0.000000 | 1.000000 | 0.119936 |
We discovered details on 17,426 tackles and attempted tackles in the dataset. Out of the 1,683 players mentioned, 800 were part of these events. In about 56.92% of the tackle records, a player actually made a tackle, and in 31.53% of the records, a player assisted in a tackle.
Games by Dates
# Number of games per calendar date. gameDate is still a string column here,
# so the sort below is lexicographic on the MM/DD/YYYY text.
games_count = (
    games['gameDate']
    .value_counts()
    .rename_axis('date')
    .reset_index(name='games')
    .sort_values('date')
)

# Horizontal bars: one per date, bar length = number of games.
plt.figure(figsize=(6, 6))
sns.barplot(x='games', y='date', data=games_count, palette='viridis')
plt.title('Number of games by dates')
plt.ylabel('Date')
plt.xlabel('Number of games')
plt.show()
Most games in the dataset happen on Sundays, like on September 11, 2022, September 18, 2022, and so on.
The largest number of games on a single day, totaling 14, occurred on specific dates: September 11, September 25, October 2, and October 9, 2022.
Games by Weeks
# Number of games per week of the season, ordered by week number.
check = (
    games['week']
    .value_counts()
    .rename_axis('week')
    .reset_index(name='games')
    .sort_values('week')
)

# Line chart with a marker per week on a fixed 600x600 canvas.
fig = px.line(
    check,
    x='week',
    y='games',
    markers=True,
    title='Number of games over weeks',
    labels={'week': 'Week', 'games': 'Number of Games'},
).update_layout(width=600, height=600)
fig.show()
We find that
Home and Away Games by Teams
# Count how many games each team played at home and away.
# rename_axis()/reset_index(name=...) name both output columns explicitly, so
# the result does not depend on how the installed pandas version labels the
# output of value_counts() (the labels changed in pandas 2.0, which left the
# previous rename without a 'team' column and broke the merge below).
dfg_home = (
    games['homeTeamAbbr']
    .value_counts()
    .rename_axis('team')
    .reset_index(name='number_of_home_games')
)
dfg_away = (
    games['visitorTeamAbbr']
    .value_counts()
    .rename_axis('team')
    .reset_index(name='number_of_away_games')
)

# Outer merge keeps a team even if it appears on only one side.
outer_merged = dfg_home.merge(dfg_away, on='team', how='outer')
# Long format: one row per (team, game_type) for the grouped bar chart.
merged_melted = outer_merged.melt(id_vars='team', var_name='game_type', value_name='number_of_games')

fig = px.bar(merged_melted, x='team', y='number_of_games', color='game_type',
             labels={'number_of_games': 'Number of Games', 'game_type': 'Game Type'},
             title='Home vs. Visiting Games by Teams',
             color_discrete_map={'number_of_home_games': px.colors.qualitative.Prism[0],
                                 'number_of_away_games': px.colors.qualitative.Prism[1]},
             height=400, width=800, barmode='group')
fig.update_layout(showlegend=True, margin=dict(t=30, b=0, l=5, r=5), template="plotly_white",)
fig.show()
Home Games
# Minimum, maximum and mean final score of each team in its home games.
# The aggregations are given by name: passing np.mean as a groupby aggfunc is
# deprecated in recent pandas, which asks for the string 'mean' instead.
agg_dict_1 = {
    "min_home_score": pd.NamedAgg(column='homeFinalScore', aggfunc='min'),
    "max_home_score": pd.NamedAgg(column='homeFinalScore', aggfunc='max'),
    "avg_home_score": pd.NamedAgg(column='homeFinalScore', aggfunc='mean'),
}
agg_data = games.groupby("homeTeamAbbr").agg(**agg_dict_1).reset_index()
agg_data.head()
| homeTeamAbbr | min_home_score | max_home_score | avg_home_score | |
|---|---|---|---|---|
| 0 | ARI | 12 | 42 | 22.600000 |
| 1 | ATL | 17 | 37 | 26.200000 |
| 2 | BAL | 19 | 38 | 25.000000 |
| 3 | BUF | 27 | 41 | 35.333333 |
| 4 | CAR | 15 | 24 | 19.600000 |
Buffalo Bills Dominance: BUF leads with an impressive home score of 35.33, showcasing dominance.
Detroit Lions Variability: DET shows a wide range (15 to 45) in home scores, indicating variability.
New England Patriots Consistency: NE maintains consistent home performance with a narrow score range.
Las Vegas Raiders Strength: LV excels with a high max home score (38) and a solid average of 31.
Minnesota Vikings Excellence: MIN demonstrates excellence at home with an average score of 28.5.
Kansas City Chiefs Balance: KC displays balanced home performance, suggesting stability.
San Francisco 49ers Consistency: SF shows consistent home performances with a narrow score range.
Arizona Cardinals Competitiveness: ARI is competitive with a respectable average home score of 22.6.
Tampa Bay Buccaneers Variation: TB has varied home performances, indicating a mix of strengths and challenges.
League-Wide Diversity: The data showcases diverse home performances, highlighting unique team strengths.
Away Game
# Minimum, maximum and mean final score of each team in its away games.
# The aggregations are given by name: passing np.mean as a groupby aggfunc is
# deprecated in recent pandas, which asks for the string 'mean' instead.
agg_dict_2 = {
    "min_away_score": pd.NamedAgg(column='visitorFinalScore', aggfunc='min'),
    "max_away_score": pd.NamedAgg(column='visitorFinalScore', aggfunc='max'),
    "avg_away_score": pd.NamedAgg(column='visitorFinalScore', aggfunc='mean'),
}
agg_data = games.groupby("visitorTeamAbbr").agg(**agg_dict_2).reset_index()
agg_data.head()
| visitorTeamAbbr | min_away_score | max_away_score | avg_away_score | |
|---|---|---|---|---|
| 0 | ARI | 9 | 29 | 22.50 |
| 1 | ATL | 15 | 27 | 21.50 |
| 2 | BAL | 20 | 37 | 27.00 |
| 3 | BUF | 17 | 31 | 22.80 |
| 4 | CAR | 10 | 34 | 20.25 |
Kansas City Chiefs Dominance: KC stands out with an impressive average away score of 36.5, showcasing road dominance.
Buffalo Bills Consistency: BUF maintains strong away performance, with an average score of 22.8, highlighting consistency.
San Francisco 49ers Versatility: SF displays adaptability with a solid average away score of 20.4, showcasing versatility.
New Orleans Saints Balance: NO strikes a balance with a well-rounded average away score of 25, indicating stability.
Los Angeles Chargers Strength: LAC excels on the road, boasting an impressive average away score of 27, showcasing strength.
Seattle Seahawks High-Scoring: SEA stands out for high-scoring away games, with an average of 31, highlighting offensive prowess.
Jacksonville Jaguars Resilience: JAX shows road resilience, with an average away score of 27, indicating the ability to compete.
New England Patriots Consistency: NE maintains consistency on the road, with an average away score of 21.6.
Tampa Bay Buccaneers Variation: TB displays varied away performances, with an average score of 15, suggesting a mix of challenges and successes.
Green Bay Packers Efficiency: GB exhibits road efficiency with a decent average away score of 13.6.
# Preview the players table (first five rows) before any transformation.
(
    players.head()
    .style
    .set_caption("Sample of the Player data")
    .set_properties(**{'border': '1.3px solid blue', 'color': 'grey'})
)
| nflId | height | weight | birthDate | collegeName | position | displayName | |
|---|---|---|---|---|---|---|---|
| 0 | 25511 | 6-4 | 225 | 1977-08-03 | Michigan | QB | Tom Brady |
| 1 | 29550 | 6-4 | 328 | 1982-01-22 | Arkansas | T | Jason Peters |
| 2 | 29851 | 6-2 | 225 | 1983-12-02 | California | QB | Aaron Rodgers |
| 3 | 30842 | 6-6 | 267 | 1984-05-19 | UCLA | TE | Marcedes Lewis |
| 4 | 33084 | 6-4 | 217 | 1985-05-17 | Boston College | QB | Matt Ryan |
Player Data Transformation and Feature Engineering Before delving into the data analytics, let's do a quick player data preprocessing and feature engineering:
# Convert height from a "feet-inches" string (e.g. "6-4") to metres.
# The intermediate frame is kept under the name new_players because a later
# cell deletes it explicitly.
new_players = players['height'].str.split('-', expand=True)
new_players.columns = ['Foot', 'Inch']
new_players['Foot'] = new_players['Foot'].astype(int) * 30.48  # feet -> cm
new_players['Inch'] = new_players['Inch'].astype(int) * 2.54   # inches -> cm
new_players['height_cm'] = new_players['Foot'] + new_players['Inch']
players['height'] = new_players['height_cm'] / 100

# Convert weight to kilograms.
# NOTE(review): 0.45 is a rounded pounds-to-kilograms factor (the exact value
# is 0.45359237); assumes the raw column is in pounds - confirm. The BMI
# cut-offs used further down were presumably derived with this factor, so
# change both together.
players['weight'] = players['weight'] * 0.45

# Parse birthDate, trying each known layout in turn; values that match none
# of them become NaT. The loop variable is named fmt so the builtin format()
# is not shadowed at module level for the rest of the notebook.
formats = ["%Y-%m-%d", "%m/%d/%Y"]
parsed_dt = pd.to_datetime(players.birthDate, format=formats[0], errors='coerce')
for fmt in formats[1:]:
    parsed_dt = parsed_dt.fillna(pd.to_datetime(players.birthDate, format=fmt, errors='coerce'))
players['birthDate'] = parsed_dt
def group_position(position):
    """Map a position code to 'offensive' or 'defensive'.

    Codes listed below count as offensive; every other value is reported as
    defensive.
    """
    offensive_codes = ('QB', 'T', 'TE', 'WR', 'C', 'G', 'RB', 'FB')
    return 'offensive' if position in offensive_codes else 'defensive'
# Label every player as offensive or defensive based on the position code.
players['position_group'] = players['position'].map(group_position)

# Keep two decimals for the metric height and weight.
players['height'] = players['height'].round(2)
players['weight'] = players['weight'].round(2)

# Body mass index = weight / height^2, computed from the rounded values
# formula inherited from https://www.calculatorsoup.com/calculators/health/bmi-calculator.php
players['mass_index'] = (
    players['weight'] / (players['height'] * players['height'])
).round(2)
def group_index(index, bmi_q1=27.18, bmi_median=29.87, bmi_q3=34.53):
    """Bucket a body-mass-index value into a size label.

    Parameters
    ----------
    index : float
        The BMI value to classify.
    bmi_q1, bmi_median, bmi_q3 : float, optional
        Ascending cut-offs between the buckets. The defaults are the values
        previously hard-coded here (presumably quartiles of ``mass_index``
        for this dataset - confirm before reusing elsewhere).

    Returns
    -------
    str or float
        'S' below ``bmi_q1``, 'M' from ``bmi_q1`` up to ``bmi_median``,
        'L' from ``bmi_median`` up to ``bmi_q3``, 'XL' from ``bmi_q3``
        upwards. A missing ``index`` yields NaN rather than a label.
    """
    # Every comparison with NaN is False, which would otherwise fall through
    # to the last branch and label a missing BMI as 'XL'.
    if pd.isna(index):
        return np.nan
    if index < bmi_q1:
        return 'S'
    if index < bmi_median:
        return 'M'
    if index < bmi_q3:
        return 'L'
    return 'XL'
# Size bucket derived from the BMI value.
players['bmi_group'] = players['mass_index'].map(group_index)

# Age in whole years as of the day the notebook is run: the year difference,
# minus one when this year's birthday has not been reached yet.
today = dt.datetime.today()
players['age'] = players['birthDate'].apply(
    lambda born: today.year - born.year
    - ((born.month, born.day) > (today.month, today.day))
)
players.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1683 entries, 0 to 1682 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 nflId 1683 non-null int64 1 height 1683 non-null float64 2 weight 1683 non-null float64 3 birthDate 1204 non-null datetime64[ns] 4 collegeName 1683 non-null object 5 position 1683 non-null object 6 displayName 1683 non-null object 7 position_group 1683 non-null object 8 mass_index 1683 non-null float64 9 bmi_group 1683 non-null object 10 age 1204 non-null float64 dtypes: datetime64[ns](1), float64(4), int64(1), object(5) memory usage: 144.8+ KB
# The intermediate height frame is no longer needed; drop the reference.
del new_players

# gc.collect() runs a collection and returns the number of unreachable
# objects it found; report that number.
collected = gc.collect()
print(f"Garbage collector: collected {collected} objects.")
Garbage collector: collected 1150 objects.
# Preview the players table again, now including the engineered columns.
(
    players.head()
    .style
    .set_caption("Sample of the Player data (after transformation and FE)")
    .set_properties(**{'border': '1.3px solid blue', 'color': 'grey'})
)
| nflId | height | weight | birthDate | collegeName | position | displayName | position_group | mass_index | bmi_group | age | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25511 | 1.930000 | 101.250000 | 1977-08-03 00:00:00 | Michigan | QB | Tom Brady | offensive | 27.180000 | M | 46.000000 |
| 1 | 29550 | 1.930000 | 147.600000 | 1982-01-22 00:00:00 | Arkansas | T | Jason Peters | offensive | 39.630000 | XL | 41.000000 |
| 2 | 29851 | 1.880000 | 101.250000 | 1983-12-02 00:00:00 | California | QB | Aaron Rodgers | offensive | 28.650000 | M | 40.000000 |
| 3 | 30842 | 1.980000 | 120.150000 | 1984-05-19 00:00:00 | UCLA | TE | Marcedes Lewis | offensive | 30.650000 | L | 39.000000 |
| 4 | 33084 | 1.930000 | 97.650000 | 1985-05-17 00:00:00 | Boston College | QB | Matt Ryan | offensive | 26.220000 | S | 38.000000 |
Players by Age
# Box plot of player age on a fixed 600x600 canvas.
fig = px.box(
    players,
    x="age",
    title='Summary Statistics on Players by Age',
    color_discrete_sequence=px.colors.qualitative.Prism,
).update_layout(width=600, height=600)
fig.show()
Age of players grouped by defensive/offensive positions
# One age box per position group (offensive / defensive).
fig = px.box(
    players,
    x="position_group",
    y="age",
    title='Summary Statistics on Players by Age and Offensive/Defensive Position',
    color_discrete_sequence=px.colors.qualitative.Prism,
).update_layout(width=600, height=600)
fig.show()
There is no notable distinction in age distribution between players in offensive and defensive positions. Additionally, it is noteworthy that offensive positions exhibit a higher proportion of older players, as indicated by the 'outliers' in the diagram above.
Average age of players in different colleges.
# Mean player age per college, youngest first.
avg_age = players.groupby(['collegeName'])['age'].mean()
# reset_index(name=...) turns the college into an ordinary column and leaves
# a plain integer index. Building the frame from a dict keyed on the grouped
# index instead produced 'collegeName' both as the index name and as a
# column, which pandas treats as an ambiguous label.
dfg_agg = (
    avg_age
    .reset_index(name='avg_player_age')
    .sort_values('avg_player_age', ascending=True)
)
dfg_agg
| collegeName | avg_player_age | |
|---|---|---|
| collegeName | ||
| St. John's, Minn. | St. John's, Minn. | 25.0 |
| Norfolk State | Norfolk State | 25.0 |
| Wagner | Wagner | 25.0 |
| Berry | Berry | 25.0 |
| Missouri S&T | Missouri S&T | 25.0 |
| ... | ... | ... |
| Northwest Missouri State | Northwest Missouri State | NaN |
| Sam Houston State | Sam Houston State | NaN |
| Texas A&M-Commerce | Texas A&M-Commerce | NaN |
| Tusculum | Tusculum | NaN |
| West Florida | West Florida | NaN |
226 rows × 2 columns
# Horizontal bar per college; bar length and colour both encode the average
# player age.
fig = px.bar(
    dfg_agg,
    x="avg_player_age",
    y='collegeName',
    orientation='h',
    color='avg_player_age',
    color_continuous_scale='Viridis',
    labels={'avg_player_age': 'Average Age'},
    title='Average Age of Players in College Teams',
    template='plotly_white',
    width=600,
    height=600,
)
fig.show()
Observations:
# Box plot of player BMI on a fixed 600x600 canvas.
fig = px.box(
    players,
    x="mass_index",
    title='Summary Statistics on Players by Body Mass Index (BMI)',
    color_discrete_sequence=px.colors.qualitative.Prism,
).update_layout(width=600, height=600)
fig.show()
Observations and Insights Regarding Player Health in the NFL:
By integrating these strategies, teams can enhance player well-being and potentially improve overall game performance.
BMI of players grouped by defensive/offensive positions
# One BMI box per position group (offensive / defensive).
fig = px.box(
    players,
    x="position_group",
    y="mass_index",
    title='Summary Statistics on Players by BMI and Offensive/Defensive Position',
    color_discrete_sequence=px.colors.qualitative.Prism,
).update_layout(width=600, height=600)
fig.show()
We can see a slight difference in the BMI summary statistics for the different position groups.
Average BMI for players who study in different colleges
# Mean BMI per college, lowest first.
avg_bmi = players.groupby(['collegeName'])['mass_index'].mean()
# reset_index(name=...) turns the college into an ordinary column and leaves
# a plain integer index. Building the frame from a dict keyed on the grouped
# index instead produced 'collegeName' both as the index name and as a
# column, which pandas treats as an ambiguous label.
dfg_agg = (
    avg_bmi
    .reset_index(name='avg_player_bmi')
    .sort_values('avg_player_bmi', ascending=True)
)

# Horizontal bar per college; bar length = average BMI.
fig = px.bar(
    dfg_agg,
    y='collegeName',
    x="avg_player_bmi",
    orientation='h',
    title='Average BMI of players by the college graduated',
    height=600,
    width=600,
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()
We see that
# Number of players in every (position group, BMI group) combination.
agg_data = (
    players
    .groupby(["position_group", "bmi_group"])
    .size()
    .reset_index(name="count")
)

# Two-level sunburst: position group on the inner ring, BMI group on the
# outer ring, segment size = player count.
fig = px.sunburst(
    agg_data,
    path=["position_group", "bmi_group"],
    values='count',
    color="position_group",
    color_discrete_sequence=px.colors.qualitative.Prism,
    title="BMI Group-to-Player Position Group Association",
    height=600,
)
fig.show()
We infer that
Tackles per Game
# Styled preview of the first rows of the tackle data.
(
    tackles.head()
    .style
    .set_caption("Sample of the Tackle data")
    .set_properties(border='1.3px solid blue', color='grey')
)
# Total number of tackles recorded in each game.
# The original cell built this aggregate twice and round-tripped it through a
# temporary dict with a redundant 'game_id' column; a single groupby followed by
# reset_index(name=...) yields the same two-column frame (gameId, tackles).
tackle_games = tackles.groupby('gameId')['tackle'].sum()
agg_data = tackle_games.reset_index(name='tackles')

# Box plot of the per-game tackle totals.
fig = px.box(
    agg_data,
    x="tackles",
    title='Summary Statistics on Tackles per Game',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()
We see that the distribution of the number of tackles per game is skewed to the right, and the majority of the games had between 65 and 79 tackles per game.
Top 10 games in terms of the number of tackles per game
# Ten games with the most tackles, shaded with the 'seismic' colormap.
(
    agg_data[['gameId', 'tackles']]
    .sort_values(by='tackles', ascending=False)
    .head(10)
    .style.background_gradient(cmap='seismic')
)
| gameId | tackles | |
|---|---|---|
| 114 | 2022103005 | 94 |
| 6 | 2022091105 | 91 |
| 37 | 2022092504 | 91 |
| 93 | 2022101700 | 90 |
| 106 | 2022102311 | 89 |
| 67 | 2022100902 | 88 |
| 5 | 2022091104 | 86 |
| 104 | 2022102309 | 86 |
| 110 | 2022103001 | 86 |
| 76 | 2022100911 | 85 |
Bottom 10 games in terms of the number of tackles per game.
# Ten games with the fewest tackles, shaded with the 'seismic' colormap.
(
    agg_data[['gameId', 'tackles']]
    .sort_values(by='tackles', ascending=True)
    .head(10)
    .style.background_gradient(cmap='seismic')
)
| gameId | tackles | |
|---|---|---|
| 25 | 2022091808 | 51 |
| 55 | 2022100206 | 53 |
| 21 | 2022091804 | 57 |
| 130 | 2022110606 | 58 |
| 75 | 2022100910 | 58 |
| 101 | 2022102306 | 59 |
| 119 | 2022103010 | 60 |
| 13 | 2022091112 | 60 |
| 2 | 2022091101 | 60 |
| 23 | 2022091806 | 61 |
As we can see,
Tackles by players
# Total number of tackles credited to each player (keyed by nflId).
tackle_players = tackles.groupby('nflId')['tackle'].sum()
agg_data = tackle_players.reset_index(name='tackles')

# Histogram of the per-player totals with a rug plot in the margin.
# The original cell overwrote this figure with the box plot before showing it,
# so the histogram was never rendered; display it explicitly.
fig = px.histogram(
    agg_data,
    x="tackles",
    marginal="rug",
    title='Distribution of Tackles by Players',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()

# Box plot of the same per-player totals.
fig = px.box(
    agg_data,
    x="tackles",
    title='Summary Statistics on Tackles by Players',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()
We find that
Assists per Game
# Total number of assists recorded in each game.
assist_games = tackles.groupby('gameId')['assist'].sum()
agg_data = assist_games.reset_index(name='assists')

# Histogram of the per-game totals with a rug plot in the margin.
# The original cell replaced this figure with the box plot before showing it,
# so the histogram was never rendered; display it explicitly.
fig = px.histogram(
    agg_data,
    x="assists",
    marginal="rug",
    title='Distribution of Assists per Game',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()

# Box plot of the same per-game totals, rendered at 600x600.
fig = px.box(
    agg_data,
    x="assists",
    title='Summary Statistics on Assists per Game',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_layout(width=600, height=600)
fig.show()
We find that
Top 10 games in terms of the number of assists per game
# Ten games with the most assists, shaded with the 'seismic' colormap.
(
    agg_data[['gameId', 'assists']]
    .sort_values(by='assists', ascending=False)
    .head(10)
    .style.background_gradient(cmap='seismic')
)
| gameId | assists | |
|---|---|---|
| 101 | 2022102306 | 71 |
| 59 | 2022100210 | 62 |
| 127 | 2022110603 | 60 |
| 24 | 2022091807 | 59 |
| 124 | 2022110600 | 58 |
| 23 | 2022091806 | 56 |
| 21 | 2022091804 | 55 |
| 28 | 2022091811 | 55 |
| 35 | 2022092502 | 54 |
| 64 | 2022100600 | 54 |
From the table above, it is easy to locate the 'outlier' game with 71 assists, namely the game with gameId = '2022102306'. It would be interesting to look at the 'telemetry' (tracking data) for this game in more detail, to uncover possible hidden patterns behind the assists made.
Bottom 10 games in terms of the number of assists per game
# Ten games with the fewest assists, shaded with the 'seismic' colormap.
(
    agg_data[['gameId', 'assists']]
    .sort_values(by='assists', ascending=True)
    .head(10)
    .style.background_gradient(cmap='seismic')
)
| gameId | assists | |
|---|---|---|
| 25 | 2022091808 | 18 |
| 114 | 2022103005 | 20 |
| 20 | 2022091803 | 21 |
| 30 | 2022091900 | 22 |
| 104 | 2022102309 | 23 |
| 3 | 2022091102 | 24 |
| 125 | 2022110601 | 28 |
| 103 | 2022102308 | 28 |
| 65 | 2022100900 | 29 |
| 120 | 2022103011 | 29 |
As we can infer, the game with the smallest number of assists is the game with gameId = '2022091808' (only 18 assists committed).
# Total number of forced fumbles recorded in each game.
forced_fumble_games = tackles.groupby('gameId')['forcedFumble'].sum()
frame = {
    'game_id': forced_fumble_games.index,
    'forced_fumbles': forced_fumble_games,
}
# reset_index() turns the 'gameId' index into a column, which makes the
# temporary 'game_id' copy redundant, so it is dropped right away.
agg_data = pd.DataFrame(frame).reset_index().drop(columns=['game_id'])

# Histogram of the per-game totals with a rug plot in the margin (600x600).
fig = px.histogram(
    agg_data,
    x="forced_fumbles",
    marginal="rug",
    title='Distribution of Forced Fumbles per Game',
    color_discrete_sequence=px.colors.qualitative.Prism,
).update_layout(width=600, height=600)
fig.show()
We find that
Top 10 games in terms of the forced fumble events
# Ten games with the most forced fumbles, shaded with the 'seismic' colormap.
(
    agg_data[['gameId', 'forced_fumbles']]
    .sort_values(by='forced_fumbles', ascending=False)
    .head(10)
    .style.background_gradient(cmap='seismic')
)
| gameId | forced_fumbles | |
|---|---|---|
| 45 | 2022092512 | 4 |
| 15 | 2022091200 | 4 |
| 11 | 2022091110 | 4 |
| 1 | 2022091100 | 3 |
| 38 | 2022092505 | 3 |
| 46 | 2022092513 | 3 |
| 8 | 2022091107 | 3 |
| 71 | 2022100906 | 2 |
| 127 | 2022110603 | 2 |
| 98 | 2022102303 | 2 |
Missed Tackles per Game
# Total number of missed tackles (pff_missedTackle) recorded in each game.
missed_tackles_games = tackles.groupby('gameId')['pff_missedTackle'].sum()
agg_data = missed_tackles_games.reset_index(name='missed_tackles')

# Histogram of the per-game totals with a rug plot in the margin.
# The original cell replaced this figure with the box plot before showing it,
# so the histogram was never rendered; display it explicitly.
fig = px.histogram(
    agg_data,
    x="missed_tackles",
    marginal="rug",
    title='Distribution of Missed Tackles per Game',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()

# Box plot of the same per-game totals.
fig = px.box(
    agg_data,
    x="missed_tackles",
    title='Summary Statistics on Missed Tackles per Game',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()
We find that
# Total number of tackles credited to each player (keyed by nflId).
tackle_players = tackles.groupby('nflId')['tackle'].sum()
agg_data = tackle_players.reset_index(name='tackles')

# Histogram of the per-player totals with a rug plot in the margin.
# The original cell replaced this figure with the box plot before showing it,
# so the histogram was never rendered; display it explicitly.
fig = px.histogram(
    agg_data,
    x="tackles",
    marginal="rug",
    title='Distribution of Tackles by Players',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()

# Box plot of the same per-player totals, rendered at 600x600.
fig = px.box(
    agg_data,
    x="tackles",
    title='Summary Statistics on Tackles by Players',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_layout(width=600, height=600)
fig.show()
We find that
# Lookup table with each player's id, display name and position.
player_names = players[['nflId', 'displayName', 'position']]

# Attach names/positions to the per-player totals; the inner join keeps only
# ids present in both frames.
agg_data = agg_data.merge(player_names, how="inner", on=['nflId'])

# Thirty players with the most tackles, shaded with the 'seismic' colormap.
(
    agg_data[['nflId', 'displayName', 'position', 'tackles']]
    .sort_values(by='tackles', ascending=False)
    .head(30)
    .style.background_gradient(cmap='seismic')
)
| nflId | displayName | position | tackles | |
|---|---|---|---|---|
| 492 | 52435 | Jordyn Brooks | ILB | 61 |
| 334 | 46269 | Foyesade Oluokun | ILB | 59 |
| 337 | 46304 | Zaire Franklin | OLB | 55 |
| 63 | 41243 | C.J. Mosley | ILB | 55 |
| 106 | 42427 | Jordan Hicks | ILB | 54 |
| 606 | 53489 | Pete Werner | OLB | 51 |
| 270 | 46077 | Roquan Smith | ILB | 50 |
| 472 | 48516 | T.J. Edwards | ILB | 49 |
| 414 | 47872 | Bobby Okereke | ILB | 48 |
| 605 | 53487 | Nick Bolton | ILB | 46 |
| 237 | 44925 | Eddie Jackson | SS | 45 |
| 358 | 46669 | Jonathan Owens | FS | 45 |
| 279 | 46091 | Rashaan Evans | ILB | 45 |
| 209 | 44848 | Budda Baker | SS | 44 |
| 425 | 47913 | Drue Tranquill | ILB | 43 |
| 531 | 52527 | Mykal Walker | ILB | 43 |
| 482 | 49410 | Jalen Thompson | FS | 43 |
| 695 | 54500 | Roger McCreary | CB | 42 |
| 97 | 42388 | Eric Kendricks | ILB | 41 |
| 446 | 47996 | Donovan Wilson | SS | 41 |
| 143 | 43325 | Myles Jack | ILB | 41 |
| 538 | 52546 | L'Jarius Sneed | CB | 40 |
| 596 | 53465 | Jevon Holland | FS | 40 |
| 29 | 38577 | Bobby Wagner | ILB | 40 |
| 263 | 45345 | Nicholas Morrow | OLB | 40 |
| 521 | 52497 | Cameron Dantzler | CB | 39 |
| 598 | 53469 | Richie Grant | SS | 39 |
| 568 | 52852 | Myles Hartsfield | FS | 39 |
| 584 | 53445 | Zaven Collins | OLB | 39 |
| 332 | 46259 | DeShon Elliott | FS | 39 |
As we can see, the players in the top 30 tackle performers list are distributed by their game positions as follows
Players in the top 10 'tackler' list, in turn, play in ILB and OLB positions solely.
# Total number of assists credited to each player (keyed by nflId).
assist_players = tackles.groupby('nflId')['assist'].sum()
agg_data = assist_players.reset_index(name='assists')

# Histogram of the per-player totals with a rug plot in the margin.
# The original cell replaced this figure with the box plot before showing it,
# so the histogram was never rendered; display it explicitly.
fig = px.histogram(
    agg_data,
    x="assists",
    marginal="rug",
    title='Distribution of Assists by Players',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()

# Box plot of the same per-player totals, rendered at 600x600.
fig = px.box(
    agg_data,
    x="assists",
    title='Summary Statistics on Assists by Players',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_layout(width=600, height=600)
fig.show()
We find that
# Attach names/positions to the per-player assist totals; the inner join keeps
# only ids present in both frames.
agg_data = agg_data.merge(player_names, how="inner", on=['nflId'])

# Thirty players with the most assists, shaded with the 'seismic' colormap.
(
    agg_data[['nflId', 'displayName', 'position', 'assists']]
    .sort_values(by='assists', ascending=False)
    .head(30)
    .style.background_gradient(cmap='seismic')
)
| nflId | displayName | position | assists | |
|---|---|---|---|---|
| 279 | 46091 | Rashaan Evans | ILB | 39 |
| 617 | 53509 | Divine Deablo | OLB | 34 |
| 270 | 46077 | Roquan Smith | ILB | 32 |
| 690 | 54492 | Devin Lloyd | ILB | 32 |
| 63 | 41243 | C.J. Mosley | ILB | 32 |
| 439 | 47956 | Cole Holcomb | ILB | 32 |
| 322 | 46212 | Ja'Whaun Bentley | ILB | 31 |
| 407 | 47855 | Germaine Pratt | OLB | 31 |
| 276 | 46086 | Derwin James | FS | 30 |
| 337 | 46304 | Zaire Franklin | OLB | 28 |
| 93 | 42368 | Shaq Thompson | ILB | 28 |
| 380 | 47788 | Devin White | ILB | 27 |
| 165 | 43404 | De'Vondre Campbell | ILB | 26 |
| 143 | 43325 | Myles Jack | ILB | 26 |
| 441 | 47971 | David Long | ILB | 26 |
| 179 | 43503 | Elandon Roberts | ILB | 26 |
| 209 | 44848 | Budda Baker | SS | 26 |
| 334 | 46269 | Foyesade Oluokun | ILB | 26 |
| 531 | 52527 | Mykal Walker | ILB | 25 |
| 278 | 46088 | Leighton Vander Esch | OLB | 25 |
| 472 | 48516 | T.J. Edwards | ILB | 25 |
| 622 | 53532 | Ernest Jones | ILB | 24 |
| 129 | 42929 | Alex Singleton | OLB | 24 |
| 97 | 42388 | Eric Kendricks | ILB | 24 |
| 227 | 44888 | Alex Anzalone | ILB | 24 |
| 584 | 53445 | Zaven Collins | OLB | 24 |
| 492 | 52435 | Jordyn Brooks | ILB | 23 |
| 422 | 47891 | Julian Love | SS | 23 |
| 605 | 53487 | Nick Bolton | ILB | 23 |
| 275 | 46085 | Tremaine Edmunds | ILB | 23 |
As we can see, the players in the top 30 'assist performers' list are distributed by their game positions as follows
In turn, the players in the top 10 list of 'assist performers' play in ILB, OLB, and FS positions.
This is a straightforward metric, but it's a good indicator of a defensive player's ability to change the course of a game. More forced fumbles generally mean a player is good at creating turnover opportunities.
# Total number of forced fumbles credited to each player (keyed by nflId).
forced_fumble_players = tackles.groupby('nflId')['forcedFumble'].sum()
frame = {
    'nfl_id': forced_fumble_players.index,
    'forced_fumbles': forced_fumble_players,
}
# reset_index() turns the 'nflId' index into a column, which makes the
# temporary 'nfl_id' copy redundant, so it is dropped right away.
agg_data = pd.DataFrame(frame).reset_index().drop(columns=['nfl_id'])

# Histogram of the per-player totals with a rug plot in the margin.
fig = px.histogram(
    agg_data,
    x="forced_fumbles",
    marginal="rug",
    title='Distribution of Forced Fumbles by Players',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()
We can see that
# Attach names/positions to the per-player forced-fumble totals; the inner join
# keeps only ids present in both frames.
agg_data = agg_data.merge(player_names, how="inner", on=['nflId'])

# Ten players with the most forced fumbles, shaded with the 'seismic' colormap.
(
    agg_data[['nflId', 'displayName', 'position', 'forced_fumbles']]
    .sort_values(by='forced_fumbles', ascending=False)
    .head(10)
    .style.background_gradient(cmap='seismic')
)
| nflId | displayName | position | forced_fumbles | |
|---|---|---|---|---|
| 728 | 54574 | Coby Bryant | CB | 4 |
| 366 | 46711 | Ryan Neal | SS | 2 |
| 687 | 54487 | Quay Walker | ILB | 2 |
| 237 | 44925 | Eddie Jackson | SS | 2 |
| 606 | 53489 | Pete Werner | OLB | 2 |
| 645 | 53592 | Darrick Forrest | SS | 2 |
| 723 | 54562 | Kerby Joseph | SS | 2 |
| 187 | 43700 | Jonathan Jones | CB | 2 |
| 603 | 53481 | Jeremiah Owusu-Koramoah | OLB | 2 |
| 507 | 52469 | Kristian Fulton | CB | 1 |
Data Preprocessing
# Output column -> (source column, aggregation) for the per-player tackle stats.
agg_dict_1 = {
    "tackles": ('tackle', 'sum'),
    "assists": ('assist', 'sum'),
    "forced_fumbles": ('forcedFumble', 'sum'),
    "missed_tackles": ('pff_missedTackle', 'sum'),
}
player_tackle_df = tackles.groupby("nflId").agg(**agg_dict_1).reset_index()

# Player tackle efficiency:
#   total_tackles     = tackles + assists
#   tackle_efficiency = total_tackles / (total_tackles + missed_tackles)
# inspired by https://www.kaggle.com/code/sasakitetsuya/1st-step-data-summary-and-understanding
total = player_tackle_df['tackles'] + player_tackle_df['assists']
player_tackle_df['total_tackles'] = total
player_tackle_df['tackle_efficiency'] = total / (total + player_tackle_df['missed_tackles'])

# Combine the player attributes with the tackle stats; the inner join keeps only
# players that appear in both frames.
player_tackles = players.merge(player_tackle_df, how="inner", on=['nflId'])

# Styled preview of the merged frame.
(
    player_tackles.head()
    .style
    .set_caption("Sample of the Player data with tackle stats")
    .set_properties(border='1.3px solid blue', color='grey')
)
| nflId | height | weight | birthDate | collegeName | position | displayName | position_group | mass_index | bmi_group | age | tackles | assists | forced_fumbles | missed_tackles | total_tackles | tackle_efficiency | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 33131 | 2.030000 | 135.000000 | 1986-09-01 00:00:00 | Miami | DE | Calais Campbell | defensive | 32.760000 | L | 37.000000 | 8 | 10 | 0 | 4 | 18 | 0.818182 |
| 1 | 35449 | 1.900000 | 136.800000 | 1987-05-12 00:00:00 | California | NT | Tyson Alualu | defensive | 37.890000 | XL | 36.000000 | 4 | 4 | 0 | 1 | 8 | 0.888889 |
| 2 | 35452 | 1.880000 | 119.250000 | 1988-04-03 00:00:00 | Michigan | DE | Brandon Graham | defensive | 33.740000 | L | 35.000000 | 6 | 6 | 0 | 1 | 12 | 0.923077 |
| 3 | 35454 | 1.960000 | 123.750000 | 1989-01-01 00:00:00 | South Florida | DE | Jason Pierre-Paul | defensive | 32.210000 | L | 35.000000 | 3 | 3 | 0 | 1 | 6 | 0.857143 |
| 4 | 35459 | 1.780000 | 83.250000 | 1988-04-10 00:00:00 | Alabama | SS | Kareem Jackson | defensive | 26.280000 | S | 35.000000 | 28 | 23 | 0 | 9 | 51 | 0.850000 |
# The intermediate per-player aggregate is no longer needed once merged.
del player_tackle_df

# Run a full garbage collection pass; gc.collect() returns the number of
# unreachable objects it found.
collected = gc.collect()

# Report how many objects the collector handled.
print("Garbage collector: collected", f"{collected:d} objects.")
Garbage collector: collected 9560 objects.
Players-to-Tackles Numeric Variable Correlations
sns.set_theme(style="white")

# Numeric player attributes and tackle statistics to correlate.
numeric_cols = [
    'height', 'weight', 'mass_index', 'age', 'tackle_efficiency',
    'tackles', 'assists', 'forced_fumbles', 'missed_tackles',
]
d = player_tackles[numeric_cols]

# Pairwise correlation matrix of the selected columns.
corr = d.corr()

# Boolean mask that hides the upper triangle (diagonal included), so each pair
# is drawn only once.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Matplotlib figure/axes the heatmap is drawn on.
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap centred on zero correlation.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Annotated, square-celled heatmap; the colour scale is capped at 0.3.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    annot=True,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
<Axes: >
As we can see, there are no strong correlations between the player characteristics and tackle-related parameters. However, we can see medium-level correlations as per the list below
Tackles by Player Positions
Let's do a bivariate analysis of the relationships between the player positions and the player-to-tackle-related numeric variables. Note: We are going to investigate defensive positions only, since only players in defensive roles make tackles.
# Metric column -> label used in the corresponding subplot title.
position_metrics = {
    'tackles': 'Tackles',
    'assists': 'Assists',
    'forced_fumbles': 'Forced Fumbles',
    'missed_tackles': 'Missed Tackles',
    'tackle_efficiency': 'Tackle Efficiency',
}

# One subplot row per metric, stacked in a single column.
fig = make_subplots(
    rows=len(position_metrics),
    cols=1,
    subplot_titles=tuple(
        f'<b>Summary Statistics on Players by {label} and Defensive Roles</b>'
        for label in position_metrics.values()
    ),
)

# Box plot of each metric grouped by player position.
for row, metric in enumerate(position_metrics, start=1):
    fig.add_trace(
        go.Box(x=player_tackles['position'], y=player_tackles[metric]),
        row=row,
        col=1,
    )

# Overall figure layout.
fig.update_layout(
    showlegend=False,
    width=900,
    height=600,
    autosize=False,
    margin=dict(t=15, b=0, l=5, r=5),
    template="plotly_white",
    colorway=px.colors.qualitative.Prism,
)

# Tick font size of any colour bars.
fig.update_coloraxes(colorbar_tickfont_size=10)
# Subplot titles are annotations, so their font size is set through the
# annotations (see plotly issue #985: subplot font size is hardcoded to 16pt).
fig.update_annotations(font_size=12)
# Slightly transparent traces.
fig.update_traces(opacity=0.75)
fig.show()
We find that:
The biggest average number of tackles and assists is committed by players in SS (Strong Safety), ILB (Inside Linebacker), and FS (Free Safety) positions
Some of the players in OLB (Outside Linebacker), DT (Defensive Tackle), DE (Defensive End), and CB (Cornerback) positions make quite a large number of tackles and assists (they are indicated as outliers in the chart above)
Players in the MLB (Middle Linebacker) position are in the middle of the ranking in terms of the number of tackles and assists
Players in NT (Nose Tackle) and DB (Defensive Back) positions are in the bottom of the rank in terms of the number of tackles and assists
It is interesting to match it to the positions of the players in the top 30 'tackle performers' (see details above). We find that
The number of the players in OLB position is exceptionally high, in the top 30 list
In turn, the players in FS and SS positions are 'under-represented', in the top 30 list
The highest median tackle efficiency is observed for players in MLB (0.9329), FS (0.92), NT (0.91667), ILB (0.91107), and DT (0.9091) positions
# Metric column -> label used in the corresponding subplot title.
bmi_metrics = {
    'tackles': 'Tackles',
    'assists': 'Assists',
    'forced_fumbles': 'Forced Fumbles',
    'missed_tackles': 'Missed Tackles',
    'tackle_efficiency': 'Tackle Efficiency',
}

# One subplot row per metric, stacked in a single column.
fig = make_subplots(
    rows=len(bmi_metrics),
    cols=1,
    subplot_titles=tuple(
        f'<b>Summary Statistics on Players by {label} and BMI Group</b>'
        for label in bmi_metrics.values()
    ),
)

# Box plot of each metric grouped by BMI group.
for row, metric in enumerate(bmi_metrics, start=1):
    fig.add_trace(
        go.Box(x=player_tackles['bmi_group'], y=player_tackles[metric]),
        row=row,
        col=1,
    )

# Overall figure layout.
fig.update_layout(
    showlegend=False,
    width=900,
    height=600,
    autosize=False,
    margin=dict(t=15, b=0, l=5, r=5),
    template="plotly_white",
    colorway=px.colors.qualitative.Prism,
)

# Tick font size of any colour bars.
fig.update_coloraxes(colorbar_tickfont_size=10)
# Subplot titles are annotations, so their font size is set through the
# annotations (see plotly issue #985: subplot font size is hardcoded to 16pt).
fig.update_annotations(font_size=12)
# Slightly transparent traces.
fig.update_traces(opacity=0.75)
fig.show()
We find that:
Average tackle Efficiency
# Per college: mean tackle efficiency and the number of players with a
# non-missing efficiency value.
agg_dict_2 = {
    "tackle_efficiency": ('tackle_efficiency', 'mean'),
    "total_graduates": ('tackle_efficiency', 'count'),
}
college_tackle_efficiency_df = (
    player_tackles.groupby("collegeName")
    .agg(**agg_dict_2)
    .reset_index()
    .sort_values(by='tackle_efficiency', ascending=True)
)

# Horizontal bar chart: one bar per college.
fig = px.bar(
    college_tackle_efficiency_df,
    x="tackle_efficiency",
    y='collegeName',
    orientation='h',
    title='Average tackle_efficiency of players by the college graduated',
    width=800,
    height=900,
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()
Top 10 colleges by the average tackle efficiency of their graduates
# Ten colleges with the highest mean tackle efficiency, shaded with 'seismic'.
(
    college_tackle_efficiency_df[['collegeName', 'tackle_efficiency', 'total_graduates']]
    .sort_values(by='tackle_efficiency', ascending=False)
    .head(10)
    .style.background_gradient(cmap='seismic')
)
| collegeName | tackle_efficiency | total_graduates | |
|---|---|---|---|
| 93 | North Carolina A&T | 1.000000 | 1 |
| 123 | South Alabama | 1.000000 | 1 |
| 107 | Old Dominion | 1.000000 | 1 |
| 111 | Pennsylvania | 1.000000 | 2 |
| 101 | Northwest Missouri State | 1.000000 | 1 |
| 91 | Norfolk State | 1.000000 | 2 |
| 113 | Prairie View | 1.000000 | 1 |
| 8 | Army | 1.000000 | 1 |
| 17 | California, Pa. | 1.000000 | 1 |
| 42 | Framingham State | 1.000000 | 1 |
Top 10 colleges by the number of their graduates among the players, factored in the player efficiency
# Ten colleges with the most players; ties are broken by mean tackle efficiency
# (both keys descending). Shaded with the 'seismic' colormap.
(
    college_tackle_efficiency_df[['collegeName', 'tackle_efficiency', 'total_graduates']]
    .sort_values(by=['total_graduates', 'tackle_efficiency'], ascending=False)
    .head(10)
    .style.background_gradient(cmap='seismic')
)
| collegeName | tackle_efficiency | total_graduates | |
|---|---|---|---|
| 0 | Alabama | 0.888278 | 29 |
| 65 | Louisiana State | 0.879506 | 25 |
| 104 | Ohio State | 0.904669 | 24 |
| 76 | Michigan | 0.908371 | 17 |
| 44 | Georgia | 0.844284 | 17 |
| 57 | Iowa | 0.854481 | 16 |
| 36 | Florida | 0.847639 | 16 |
| 110 | Penn State | 0.800743 | 16 |
| 103 | Notre Dame | 0.906731 | 15 |
| 161 | Washington | 0.892252 | 15 |
Based on the charts above, we can conclude that:
Tackling Efficiency and Tackle Factor by Defensive Positions
# Mean number of total tackles (tackles + assists) per player position.
agg_dict_3 = {
    "avg_tackles_by_pos": pd.NamedAgg(column='total_tackles', aggfunc='mean'),
}
position_tackles_df = player_tackles.groupby("position").agg(**agg_dict_3).reset_index()


def set_average_tackles_by_pos(position):
    """Return the mean total tackles for ``position``.

    Looks the value up in the module-level ``position_tackles_df``.

    Raises:
        IndexError: if ``position`` has no row in ``position_tackles_df``.
    """
    avg_tackles = position_tackles_df.loc[
        position_tackles_df['position'] == position, 'avg_tackles_by_pos'].iloc[0]
    return avg_tackles


# Tackle factor = a player's total tackles relative to the average for the
# player's position. The per-position average is attached with a single
# vectorized map instead of calling the helper above once per row (each call
# scans the whole lookup table); no temporary column has to be added/dropped.
avg_tackles_lookup = position_tackles_df.set_index('position')['avg_tackles_by_pos']
player_tackles['tackle_factor'] = (
    player_tackles['total_tackles'] / player_tackles['position'].map(avg_tackles_lookup)
)

# Styled preview of the frame with the new column.
player_tackles.head(5).style.set_caption("Sample of the Player data with tackle factor stats"). \
    set_properties(**{'border': '1.3px solid blue',
                      'color': 'grey'})
# Scatter of tackle efficiency against tackle factor, coloured by position;
# hovering a point also shows the player's id and display name.
fig = px.scatter(
    player_tackles,
    x="tackle_factor",
    y="tackle_efficiency",
    color='position',
    hover_data=['nflId', 'displayName'],
    title='Tackling Efficiency and Tackle Factor by Position',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()
It has been demonstrated that,
# Twenty players with the highest tackle factor, shaded with 'seismic'.
(
    player_tackles[['nflId', 'displayName', 'position', 'tackle_efficiency', 'tackle_factor']]
    .sort_values(by='tackle_factor', ascending=False)
    .head(20)
    .style.background_gradient(cmap='seismic')
)
| nflId | displayName | position | tackle_efficiency | tackle_factor | |
|---|---|---|---|---|---|
| 337 | 46304 | Zaire Franklin | OLB | 0.873684 | 5.080859 |
| 617 | 53509 | Divine Deablo | OLB | 0.912500 | 4.468707 |
| 606 | 53489 | Pete Werner | OLB | 0.901235 | 4.468707 |
| 584 | 53445 | Zaven Collins | OLB | 0.828947 | 3.856556 |
| 129 | 42929 | Alex Singleton | OLB | 0.885714 | 3.795340 |
| 485 | 52415 | Derrick Brown | DE | 0.954545 | 3.676220 |
| 244 | 44957 | Grover Stewart | DT | 0.937500 | 3.630000 |
| 278 | 46088 | Leighton Vander Esch | OLB | 0.951613 | 3.611695 |
| 263 | 45345 | Nicholas Morrow | OLB | 0.855072 | 3.611695 |
| 420 | 47889 | Maxx Crosby | DE | 0.833333 | 3.501162 |
| 407 | 47855 | Germaine Pratt | OLB | 0.890625 | 3.489265 |
| 70 | 41300 | Christian Kirksey | OLB | 0.900000 | 3.305619 |
| 385 | 47796 | Christian Wilkins | DT | 0.952381 | 3.226667 |
| 417 | 47881 | Quincy Williams | OLB | 0.842105 | 2.938328 |
| 300 | 46146 | Sam Hubbard | DE | 0.868421 | 2.888459 |
| 538 | 52546 | L'Jarius Sneed | CB | 0.900000 | 2.880873 |
| 405 | 47848 | Zach Allen | DE | 0.969697 | 2.800930 |
| 148 | 43335 | A'Shawn Robinson | DT | 0.918919 | 2.742667 |
| 66 | 41263 | Demarcus Lawrence | DE | 0.861111 | 2.713400 |
| 14 | 37097 | Cameron Jordan | DE | 0.837838 | 2.713400 |
Tackling Efficiency and Tackle Factor by BMI Group
# Scatter of tackle efficiency against tackle factor, coloured by BMI group;
# hovering a point also shows the player's id, display name and position.
fig = px.scatter(
    player_tackles,
    x="tackle_factor",
    y="tackle_efficiency",
    color='bmi_group',
    hover_data=['nflId', 'displayName', 'position'],
    title='Tackling Efficiency and Tackle Factor by BMI Group',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()
It has been observed that,
Tackles by Offensive Formation
# Output column -> (source column, aggregation) for the per-play tackle stats.
agg_dict_1 = {
    "tackles": ('tackle', 'sum'),
    "assists": ('assist', 'sum'),
    "forced_fumbles": ('forcedFumble', 'sum'),
    "missed_tackles": ('pff_missedTackle', 'sum'),
}
# A play is identified by the (gameId, playId) pair.
play_tackle_df = tackles.groupby(["gameId", "playId"]).agg(**agg_dict_1).reset_index()

# Outer join so that plays without any tackle record are kept as well (their
# tackle stats come out as NaN).
play_tackles = plays.merge(play_tackle_df, how="outer", on=["gameId", "playId"])

# Styled preview of the merged frame.
(
    play_tackles.head()
    .style
    .set_caption("Sample of the Play data with tackle stats")
    .set_properties(border='1.3px solid blue', color='grey')
)
| gameId | playId | ballCarrierId | ballCarrierDisplayName | playDescription | quarter | down | yardsToGo | possessionTeam | defensiveTeam | yardlineSide | yardlineNumber | gameClock | preSnapHomeScore | preSnapVisitorScore | passResult | passLength | penaltyYards | prePenaltyPlayResult | playResult | playNullifiedByPenalty | absoluteYardlineNumber | offenseFormation | defendersInTheBox | passProbability | preSnapHomeTeamWinProbability | preSnapVisitorTeamWinProbability | homeTeamWinProbabilityAdded | visitorTeamWinProbilityAdded | expectedPoints | expectedPointsAdded | foulName1 | foulName2 | foulNFLId1 | foulNFLId2 | tackles | assists | forced_fumbles | missed_tackles | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022100908 | 3537 | 48723 | Parker Hesse | (7:52) (Shotgun) M.Mariota pass short middle to P.Hesse to 50 for 9 yards (K.Neal). | 4 | 1 | 10 | ATL | TB | ATL | 41 | 7:52 | 21 | 7 | C | 6.000000 | nan | 9 | 9 | N | 69 | SHOTGUN | 7.000000 | 0.747284 | 0.976785 | 0.023215 | -0.006110 | 0.006110 | 2.360609 | 0.981955 | nan | nan | nan | nan | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1 | 2022091103 | 3126 | 52457 | Chase Claypool | (7:38) (Shotgun) C.Claypool right end to PIT 37 for 3 yards (C.Awuzie). | 4 | 1 | 10 | PIT | CIN | PIT | 34 | 7:38 | 14 | 20 | nan | nan | nan | 3 | 3 | N | 76 | SHOTGUN | 7.000000 | 0.416454 | 0.160485 | 0.839515 | -0.010865 | 0.010865 | 1.733344 | -0.263424 | nan | nan | nan | nan | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
| 2 | 2022091111 | 1148 | 42547 | Darren Waller | (8:57) D.Carr pass short middle to D.Waller to LV 45 for 15 yards (N.Adderley). | 2 | 2 | 5 | LV | LAC | LV | 30 | 8:57 | 10 | 3 | C | 11.000000 | nan | 15 | 15 | N | 40 | I_FORM | 6.000000 | 0.267933 | 0.756661 | 0.243339 | -0.037409 | 0.037409 | 1.312855 | 1.133666 | nan | nan | nan | nan | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3 | 2022100212 | 2007 | 46461 | Mike Boone | (13:12) M.Boone left tackle to DEN 44 for 7 yards (J.Abram; D.Deablo). | 3 | 2 | 10 | DEN | LV | DEN | 37 | 13:12 | 19 | 16 | nan | nan | nan | 7 | 7 | N | 47 | SINGLEBACK | 6.000000 | 0.592704 | 0.620552 | 0.379448 | -0.002451 | 0.002451 | 1.641006 | -0.043580 | nan | nan | nan | nan | 0.000000 | 2.000000 | 0.000000 | 2.000000 |
| 4 | 2022091900 | 1372 | 47857 | Devin Singletary | (8:33) D.Singletary right guard to TEN 32 for 3 yards (N.Jones; O.Adeniyi). TEN-O.Adeniyi was injured during the play. His return is Questionable. O.Adeniyi walks off. | 2 | 1 | 10 | BUF | TEN | TEN | 35 | 8:33 | 7 | 7 | nan | nan | nan | 3 | 3 | N | 75 | I_FORM | 7.000000 | 0.470508 | 0.836290 | 0.163710 | 0.001053 | -0.001053 | 3.686428 | -0.167903 | nan | nan | nan | nan | 0.000000 | 2.000000 | 0.000000 | 1.000000 |
# Plays without a tackle record have NaN stats after the outer merge; mark them
# with the sentinel -1 so the columns can be stored as integers.
values = {"tackles": -1.0, "assists": -1.0, "forced_fumbles": -1.0, "missed_tackles": -1.0}
play_tackles = play_tackles.fillna(value=values)
play_tackles = play_tackles.astype({"tackles": 'int', "assists": 'int',
                                    "forced_fumbles": 'int', "missed_tackles": 'int'})
play_tackles.info()

# Number of plays per offense formation and per-play tackle count.
ctt = pd.crosstab(play_tackles['offenseFormation'], play_tackles['tackles'])

# Label each column by the tackle count it actually holds. Assigning a fixed
# four-item list positionally raises when the data does not contain exactly four
# distinct counts and mislabels columns when a different set of four occurs.
tackle_labels = {
    -1: 'No tackles',
    0: 'Unsuccessful tackles',
    1: 'Successful tackles',
    2: '2 successful tackles',
}
ctt.columns = [
    tackle_labels.get(count, f'{count} successful tackles')
    for count in ctt.columns
]
<class 'pandas.core.frame.DataFrame'> Int64Index: 12486 entries, 0 to 12485 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gameId 12486 non-null int64 1 playId 12486 non-null int64 2 ballCarrierId 12486 non-null int64 3 ballCarrierDisplayName 12486 non-null object 4 playDescription 12486 non-null object 5 quarter 12486 non-null int64 6 down 12486 non-null int64 7 yardsToGo 12486 non-null int64 8 possessionTeam 12486 non-null object 9 defensiveTeam 12486 non-null object 10 yardlineSide 12319 non-null object 11 yardlineNumber 12486 non-null int64 12 gameClock 12486 non-null object 13 preSnapHomeScore 12486 non-null int64 14 preSnapVisitorScore 12486 non-null int64 15 passResult 6105 non-null object 16 passLength 5634 non-null float64 17 penaltyYards 615 non-null float64 18 prePenaltyPlayResult 12486 non-null int64 19 playResult 12486 non-null int64 20 playNullifiedByPenalty 12486 non-null object 21 absoluteYardlineNumber 12486 non-null int64 22 offenseFormation 12482 non-null object 23 defendersInTheBox 12481 non-null float64 24 passProbability 12149 non-null float64 25 preSnapHomeTeamWinProbability 12486 non-null float64 26 preSnapVisitorTeamWinProbability 12486 non-null float64 27 homeTeamWinProbabilityAdded 12486 non-null float64 28 visitorTeamWinProbilityAdded 12486 non-null float64 29 expectedPoints 12486 non-null float64 30 expectedPointsAdded 12485 non-null float64 31 foulName1 592 non-null object 32 foulName2 25 non-null object 33 foulNFLId1 592 non-null float64 34 foulNFLId2 25 non-null float64 35 tackles 12486 non-null int32 36 assists 12486 non-null int32 37 forced_fumbles 12486 non-null int32 38 missed_tackles 12486 non-null int32 dtypes: float64(12), int32(4), int64(12), object(11) memory usage: 3.6+ MB
# Heatmap
# Heatmap trace: formations on the y axis, tackle outcome labels on the x axis,
# cell colour given by the crosstab counts.
heatmap_trace = go.Heatmap(
    x=ctt.columns,
    y=ctt.index,
    z=ctt,
    colorscale='purples',
)

# Wrap the trace in a figure, give it a title and display it.
fig = go.Figure(heatmap_trace)
fig.update_layout(
    title='Heatmap of Successful/Unsuccessful tackles by Offense Formation',
)
fig.show()
We find that
Team Tackling Performance
# inspired by https://www.kaggle.com/code/sasakitetsuya/1st-step-data-summary-and-understanding
# Merge tackles data with plays data to get the defensive team for each play
# Attach the defensive team of each play to the tackle records.
# inspired by https://www.kaggle.com/code/sasakitetsuya/1st-step-data-summary-and-understanding
play_teams = plays[['gameId', 'playId', 'defensiveTeam']]
team_tackles_df = tackles.merge(play_teams, on=['gameId', 'playId'])

# Per defensive team: summed tackles, assists and missed tackles.
team_performance = (
    team_tackles_df.groupby('defensiveTeam')
    .agg(
        total_tackles=('tackle', 'sum'),
        total_assists=('assist', 'sum'),
        missed_tackles=('pff_missedTackle', 'sum'),
    )
    .reset_index()
)
team_performance['total_tackles_and_assists'] = (
    team_performance['total_tackles'] + team_performance['total_assists']
)

# Teams ordered by tackles + assists, highest first.
team_performance_sorted = team_performance.sort_values(
    by='total_tackles_and_assists', ascending=False)

# (value column, hover text, trace name) for each bar series; the position in
# this list selects the colour from the Prism palette.
bar_specs = [
    ('total_tackles_and_assists', "Number of total tackles and assists",
     "number_of_total_tackles_and_assists"),
    ('total_tackles', "Number of total tackles", "number_of_total_tackles"),
    ('total_assists', "Number of assists in tackles", "number_of_assists"),
    ('missed_tackles', "Number of missed tackles", "number_of_missed_tackles"),
]

fig = go.Figure()
for palette_idx, (value_col, hover, trace_name) in enumerate(bar_specs):
    fig.add_bar(
        x=team_performance_sorted['defensiveTeam'],
        y=team_performance_sorted[value_col],
        hovertext=hover,
        name=trace_name,
        marker_color=px.colors.qualitative.Prism[palette_idx],
    )

# Overall figure layout.
fig.update_layout(
    showlegend=True,
    width=800,
    height=400,
    autosize=False,
    margin=dict(t=30, b=0, l=5, r=5),
    template="plotly_white",
    yaxis_title="Number of tackles",
    title=dict(text='Total vs. Missing Tackles by Teams', font=dict(size=20), yref='paper'),
)

# Tick font size of any colour bars.
fig.update_coloraxes(colorbar_tickfont_size=10)
# Font size of any annotations (see plotly issue #985 on subplot title fonts).
fig.update_annotations(font_size=12)
# Slightly transparent traces.
fig.update_traces(opacity=0.75)
fig.show()
We can see that
However, it's essential to consider total tackles, assists, and missed tackles together when evaluating a team's defensive performance.
# inspired by https://www.kaggle.com/code/sasakitetsuya/1st-step-data-summary-and-understanding
# Win rate per team = games won / games played, counting home and away games.
# Calculate win counts for home and visitor teams. The comparisons are strict,
# so a game with equal final scores counts as a win for neither side.
home_wins = (
    games[games['homeFinalScore'] > games['visitorFinalScore']]
    .groupby('homeTeamAbbr').size().reset_index(name='home_wins')
)
visitor_wins = (
    games[games['visitorFinalScore'] > games['homeFinalScore']]
    .groupby('visitorTeamAbbr').size().reset_index(name='visitor_wins')
)
# Calculate the total number of games played by each team (home + away)
team_games_home = games['homeTeamAbbr'].value_counts()
team_games_visitor = games['visitorTeamAbbr'].value_counts()
total_games_per_team = team_games_home.add(team_games_visitor, fill_value=0)
# Build the win table from every team that played at least one game.
# Outer-merging the two win tables and calling fillna(0) would overwrite the
# team abbreviation of teams that only won on the road, and teams without any
# win would never appear; both groups would then be missing from the chart.
team_wins = pd.DataFrame({'homeTeamAbbr': total_games_per_team.index})
team_wins['home_wins'] = (
    team_wins['homeTeamAbbr']
    .map(home_wins.set_index('homeTeamAbbr')['home_wins'])
    .fillna(0)  # team has no home win
)
team_wins['visitor_wins'] = (
    team_wins['homeTeamAbbr']
    .map(visitor_wins.set_index('visitorTeamAbbr')['visitor_wins'])
    .fillna(0)  # team has no road win
)
team_wins['total_wins'] = team_wins['home_wins'] + team_wins['visitor_wins']
# Calculate win rate using the total (home + away) games played
team_win_rate = team_wins.copy()
team_win_rate['games_played'] = team_win_rate['homeTeamAbbr'].map(total_games_per_team)
team_win_rate['win_rate'] = team_win_rate['total_wins'] / team_win_rate['games_played']
# Sort ascending so that the horizontal bar chart lists the best team at the top
team_win_rate_sorted = team_win_rate[['homeTeamAbbr', 'win_rate']].sort_values(by='win_rate', ascending=True)
fig = px.bar(
    team_win_rate_sorted,
    y='homeTeamAbbr',
    x="win_rate",
    orientation='h',
    title='Winning Rate by teams',
    height=900,
    width=800,
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.show()
As we can see, the Philadelphia Eagles (PHI) have the highest win rate of 100%, indicating they won all their games in the data provided. In contrast, the Carolina Panthers (CAR) have the lowest win rate of 22.2%.
Win Rate and Tackling Performance of Teams
Reviewing the charts above, we can intuitively assume that there is no strong linear relationship between the teams' tackling performance and their win rates.
The Philadelphia Eagles (PHI), who hold the highest win rate (1.0) in the season, do not demonstrate outstanding tackling performance (they are among the bottom five teams in terms of the tackling performance metrics). In turn, the leaders of the tackling performance ranking do not demonstrate a high win rate (the win rate of the Atlanta Falcons (ATL) is 0.4444, and the win rate of the Seattle Seahawks (SEA) is 0.6667). Let's check this intuition with a bit of statistical analysis and visualization. Let's look at the respective correlations.
# Correlation between the per-team tackling metrics and the win rate.
# Attach each team's win rate to its tackling totals (teams are matched on
# their abbreviation; only teams present in both tables are kept).
dd = team_performance_sorted.merge(
    team_win_rate[['homeTeamAbbr', 'win_rate']],
    left_on='defensiveTeam',
    right_on='homeTeamAbbr',
)
d = dd[['total_tackles', 'total_assists', 'missed_tackles', 'total_tackles_and_assists', 'win_rate']]
# Compute the correlation matrix for numeric features
corr = d.corr()
# Generate a mask for the upper triangle (diagonal included), so that each
# pair of features is shown only once
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio.
# The colour scale spans the full correlation range [-1, 1]; capping it at 0.3
# would paint every correlation above 0.3 in the same saturated colour.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0, annot=True,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
<Axes: >
We can see that
CONCLUSION
Wise people say that there is a yet-to-be-discovered rationale behind most intuitive anticipations. A review of the body of knowledge on tackling success factors in Union Rugby shows that the research there has identified good approaches.
The purpose of this essay is to scrutinize the previous findings on the tackling range and player-in-tackle evaluation in Union Rugby, and to plot NFL data-driven insights to show which of the metrics identified for tackle evaluation in Union Rugby also work well for NFL tackle evaluation.
The current research can be extended for sure.